#-*- coding: UTF-8 -*- 

'''
Created on 2011-11-15

@author: lyletzzzw
'''
import httplib;
import re;
 
def getHtml(realm, uri, maxAttempts=5):
    """
    Fetch the body of a page over plain HTTP.

    Issues "GET uri" against the host `realm` and returns the response body
    when the server answers with status 200. Any other status is retried on
    a fresh connection, up to `maxAttempts` times in total.

    @param realm: host name, e.g. 19bt.info
    @param uri: path and query of the page, e.g. /list.php?p=1&area=2,2&readfor=
    @param maxAttempts: maximum number of requests to send before giving up
                        (values below 1 are treated as 1)
    @return: the response body as returned by HTTPResponse.read()
    @raise IOError: if no attempt was answered with status 200
    """
    lastStatus = None
    lastReason = None

    # Bounded loop instead of unbounded recursion: a page that never answers
    # 200 (permanent 404/500, redirect, ...) must not recurse until the
    # interpreter's recursion limit is hit.
    for _ in range(max(1, maxAttempts)):
        conn = httplib.HTTPConnection(realm)
        try:
            conn.request("GET", uri)
            response = conn.getresponse()
            if response.status == 200:
                return response.read()
            lastStatus = response.status
            lastReason = response.reason
        finally:
            # Always release the socket, including when request() or
            # getresponse() raises and before the next retry is started.
            conn.close()

    raise IOError(
        "GET http://%s%s failed after %d attempt(s); last status: %s %s"
        % (realm, uri, max(1, maxAttempts), lastStatus, lastReason)
    )

def search(realm, patternStr, contextStr):
    """
    Collect every match of `patternStr` in `contextStr` as a rebuilt string.

    The pattern must define at least five capture groups. For each match the
    result is group 1, then `realm`, then groups 2 to 5, followed by the
    literal "</br>". With the pattern used in this file that turns a relative
    link into an absolute one by inserting `realm` in front of the href value.

    @param realm: text inserted between group 1 and group 2
    @param patternStr: regular expression with five capture groups
    @param contextStr: text to search
    @return: list of rebuilt strings, in match order (empty if nothing matches)
    """
    links = []
    for match in re.finditer(patternStr, contextStr):
        opening, target, attrs, label, closing = match.group(1, 2, 3, 4, 5)
        links.append(opening + realm + target + attrs + label + closing + "</br>")
    return links

def execute(realm, uri, pattern, pageSize):
    """
    Entry point: crawl the numbered list pages and print every matching link.

    `uri` is a template in which the literal "[]" marks where the page number
    goes, e.g. /list.php?p=[]&area=2,2&readfor= . Each page is fetched with
    getHtml(), scanned with search(), and a one-line progress message is
    printed per page. After the last page all collected links are printed,
    one per line.

    @param realm: host name of the site
    @param uri: page URI template containing "[]" as the page-number slot
    @param pattern: regular expression handed to search()
    @param pageSize: exclusive upper bound of the page numbers to visit
    """
    # Only the text before the first "[]" and the text up to the next "[]"
    # (or the end of the template) are used.
    uriParts = uri.split("[]")
    linkPrefix = "http://" + realm + "/"
    htmlLists = []

    # NOTE(review): range(1, pageSize) visits pages 1 .. pageSize-1. If
    # pageSize is meant to be the page count, the last page is skipped.
    # Behavior is left unchanged here; confirm the intent.
    for i in range(1, pageSize):
        newUri = uriParts[0] + str(i) + uriParts[1]
        resultHtml = getHtml(realm, newUri)
        htmlList = search(linkPrefix, pattern, resultHtml)
        # extend() appends in place instead of copying the whole accumulated
        # list on every page.
        htmlLists.extend(htmlList)
        print("第"+str(i)+"页,"+str(len(resultHtml))+"个字符，找到"+str(len(htmlList))+"个!")

    for item in htmlLists:
        # Parenthesised single-argument form prints the same under the
        # Python 2 print statement and the print function.
        print(item)

# Script driver: runs the crawl as soon as this file is executed or imported.
execute(
    realm="19bt.info",
    uri="/list.php?p=[]&area=2,2&readfor=",
    pattern="(<a href=\")(.+)(\" target=\"_blank\">)(.*美腿.*)(</a>)",
    pageSize=318,
)
